Team A6: Yihan Jia, Yinghao Wang, Simeng Li, Xiangshan Mu, Kaiyu Wang
Business Objectives:
- The objective of the exercise is to find a tradable signal in the dataset.
Project Flow:
- Standard time series models can be used or non-parametric models like neural networks can be deployed. Suggested course of actions is as follows:
- Understand the dataset and what it represents
- Define the problem: what does 'a tradable signal' mean based on the data available (i.e. develop your modeling equation, if any)
- Start with simple analysis to identify patterns and outliers
- Develop models with increasing complexity - i.e. don't start with neural nets etc
- Define your success criteria - i.e. when do you know you have a tradable signal - positive (buy)/negative (sell), both are signals
# imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
from google.colab import drive
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.api import ExponentialSmoothing
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
# FIX: statsmodels >= 0.13 removed statsmodels.tsa.arima_model; the maintained
# implementation lives in statsmodels.tsa.arima.model (same ARIMA(order=...) API).
from statsmodels.tsa.arima.model import ARIMA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree, neural_network
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support
# reset recursion limit (LSTM training below can recurse deeply)
import sys
sys.setrecursionlimit(10000)
import torch  # pytorch
import torch.nn as nn
from torch.autograd import Variable
# suppress filter warnings
import warnings
warnings.filterwarnings("ignore")
# plt style
# FIX: the "seaborn" style alias was removed in matplotlib 3.8; fall back for
# older installations so the notebook runs on either version.
try:
    plt.style.use("seaborn-v0_8")
except OSError:
    plt.style.use("seaborn")
The dataset contains sectoral data for 3 separate types of investments made in the US (no international funds are included) and represents 60-70% of activity in the overall asset classes for that week (not all investors report data at the same time). The weekly data spans roughly 10 years, from 2006 through the end of January 2017. Not all sectors have data available for all dates, since new investment vehicles are introduced at various points in time.
Column descriptions — ReportDate: weekly data aggregated and released every Wednesday. AssetClass: industry/sector/asset class. Flow: amount of inflow (positive) or outflow (negative) in millions of USD. FlowPct: flows as a percent of assets at the beginning of the week. AssetsEnd: assets at the end of the week in millions of USD. PortfolioChangePct: percent change in the overall portfolio during the week.
# read datasets
# read datasets
# try:
#     # mount my google drive
#     drive.mount('/content/drive')
#     # change directory to the path that contains the uploaded dataset
#     %cd /content/drive/My Drive/Capstone Project
#     # read through google drive
#     # FIX: the fallback previously read 'US Sector Inst ETF.csv' into IMF,
#     # which would have silently duplicated the ETF data; corrected filename.
#     IMF = pd.read_csv('US Sector Institutional MF.csv')
#     ETF = pd.read_csv('US Sector Inst ETF.csv')
#     RMF = pd.read_csv('US Sector Retail MF.csv')
# except:
# or read through github
ETF = pd.read_csv('https://raw.githubusercontent.com/yinghaow525/MSBA-Projects/main/Capstone%20Project/Datasets/US%20Sector%20Inst%20ETF.csv')
IMF = pd.read_csv('https://raw.githubusercontent.com/yinghaow525/MSBA-Projects/main/Capstone%20Project/Datasets/US%20Sector%20Institutional%20MF.csv')
RMF = pd.read_csv('https://raw.githubusercontent.com/yinghaow525/MSBA-Projects/main/Capstone%20Project/Datasets/US%20Sector%20Retail%20MF.csv')
# sanity-check the three frames
print(IMF.head(3))
print('-'*50)
print(ETF.head(3))
print('-'*50)
print(RMF.head(3))
# tag each frame with its fund type before combining
ETF['Type'] = 'ETF'
IMF['Type'] = 'IMF'
RMF['Type'] = 'RMF'

# stack the three datasets into a single frame
data = pd.concat([ETF, IMF, RMF], axis=0)

# missing values per column
print(data.isna().sum())

# duplicated rows
data.duplicated().sum()

# parse the report date into a proper datetime column
data['ReportDate'] = pd.to_datetime(data['ReportDate'])

# the Asset and Sector components of AssetClass are constant (US Equity),
# so only the leading industry component is kept as a feature
data['AssetClass'] = data['AssetClass'].str.split('-')
data['Industry'] = data.AssetClass.apply(lambda parts: parts[0])

# the original AssetClass column is no longer needed
data.drop(columns='AssetClass', inplace=True)

# distinct industries and their record counts
data.Industry.value_counts()

# derive calendar features from the report date
data['year'] = data['ReportDate'].dt.year
data['month'] = data['ReportDate'].dt.month
data.head()
# quick look at one sub-series: Energy industry, ETF type
test = data.loc[(data.Type == 'ETF') & (data.Industry == 'Energy')][['Flow', 'ReportDate']]
plt.figure(figsize=(40, 3))
plt.plot(test.ReportDate, test.Flow)
plt.title('Fund Flow of Energy Industry with ETF Type')
plt.show()

# max, min & mean value for Flow
print('Max value of Flow is {};'.format(data.Flow.max()), '\n'
      'Min value of Flow is {};'.format(data.Flow.min()), '\n'
      'Avg value of Flow is {}.'.format(data.Flow.mean()))

# distribution of entire PortfolioChangePct
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot + rugplot reproduce the histogram-with-rug, kdeplot the smoothed density.
plt.figure(figsize=(8, 5))
sns.histplot(data.PortfolioChangePct, color='lightblue', bins=30, stat='density')
sns.rugplot(data.PortfolioChangePct, color='lightblue')
sns.kdeplot(data.PortfolioChangePct)
plt.title("Distribution of PortfolioChangePct")
plt.show()
# plotting PortfolioChangePct distribution by industry, one boxplot per industry
# (loop variable renamed from `_`, which conventionally marks an UNUSED value)
for industry in data.Industry.value_counts().index:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data.loc[data.Industry == industry].PortfolioChangePct)
    plt.title(industry + ' Industry PortfolioChangePct')
    plt.show()

# ... or aggregated into a single plot
data.boxplot(column='PortfolioChangePct', by='Industry', figsize=(15, 10),
             rot=45, fontsize=10, patch_artist=True)
plt.show()
### outliers: >= |15| ???

# ... or aggregated into a single plot — wide version for readability
data.boxplot(column='PortfolioChangePct', by='Industry', figsize=(80, 20),
             rot=45, fontsize=16, patch_artist=True)
plt.show()
### outliers: >= |15| ???
# group by fund type (previous comment said "education" — a copy-paste leftover)
# numeric_only=True keeps only numeric columns, matching the silent column
# dropping of older pandas; pandas >= 2.0 raises on non-numeric columns otherwise
groupby_type = data.groupby('Type').mean(numeric_only=True)
groupby_type.reset_index(inplace=True)
groupby_type
# seaborn >= 0.12 requires x/y as keyword arguments
sns.barplot(x=groupby_type.Type, y=groupby_type.Flow, data=groupby_type,
            palette=["royalblue", "cornflowerblue", "lightsteelblue"])
plt.title('Average Flow of Different Types of Investments')
plt.show()

# average flow by calendar month
groupby_month = data.groupby('month').mean(numeric_only=True)
groupby_month.reset_index(inplace=True)
groupby_month
sns.barplot(x=groupby_month.month, y=groupby_month.Flow, data=groupby_month, color="cornflowerblue")
plt.title('Average Flow of Different Months')
plt.show()

# average flow by year (distinct name; the original reused groupby_month here)
groupby_year = data.groupby('year').mean(numeric_only=True)
groupby_year.reset_index(inplace=True)
groupby_year
sns.barplot(x=groupby_year.year, y=groupby_year.Flow, data=groupby_year, color="cornflowerblue")
plt.title('Average Flow of Different Years')
plt.show()

# average metrics by industry, then correlation across the four measures
groupby_indus = data.groupby('Industry').mean(numeric_only=True)
groupby_indus = groupby_indus[['Flow', 'FlowPct', 'PortfolioChangePct', 'AssetsEnd']]
groupby_indus.T
fig = sns.heatmap(groupby_indus.corr(), cmap="Blues")
fig
list_ind = ['Flow', 'FlowPct', 'PortfolioChangePct', 'AssetsEnd']
# industry tick labels, hoisted out of the loop (loop-invariant)
index_ls = ['Commodities/Materials', 'Consumer Goods', 'Energy', 'Financials',
            'Health Care/Biotech', 'Industrials', 'Infrastructure',
            'Large Cap Blend', 'Large Cap Growth', 'Large Cap Value',
            'Mid Cap Blend', 'Mid Cap Growth', 'Mid Cap Value', 'Real Estate',
            'Small Cap Blend', 'Small Cap Growth', 'Small Cap Value',
            'Technology', 'Telecom', 'Utilities']
# BUG FIX: there are 20 industries but the tick positions used range(19),
# misaligning labels against positions; derive positions from the label list.
scale_ls = range(len(index_ls))
for i in list_ind:
    # per-industry average of one metric
    groupby_indus = data.groupby('Industry').mean(numeric_only=True)
    groupby_indus = groupby_indus[[i]]
    groupby_indus.plot(figsize=(16, 8))
    plt.xticks(scale_ls, index_ls, rotation=45)
    plt.title('Average ' + i + ' of different industries')
    plt.xlabel('industries')
    plt.ylabel(i)
    plt.legend()
    plt.show()
We find that four industries have fewer data records than the others: Commodities/Materials, Consumer Goods, Industrials, and Infrastructure.
We want to understand the missing records in 2 ways:
Commodities/Materials:
# checking missing data for Commodities/Materials; the RMF type has fewer records
# record counts per fund type for Commodities/Materials
data.loc[data.Industry == 'Commodities/Materials'].Type.value_counts()

# the RMF records for Commodities/Materials are missing at the start of the timeline
cm_rmf = data.loc[(data.Industry == 'Commodities/Materials') & (data.Type == 'RMF')]
cm_imf = data.loc[(data.Industry == 'Commodities/Materials') & (data.Type == 'IMF')]
print('The most recent ReportDate for RMF type Commodities/Materials industry is {};'.format(cm_rmf.ReportDate.max()), '\n'
      'The least recent ReportDate for RMF type Commodities/Materials industry is {}.'.format(cm_rmf.ReportDate.min()), '\n'
      'Compare with other typed while same industry funds, their most recent ReportDate is {};'.format(cm_imf.ReportDate.max()), '\n'
      'and their least recent ReportDate is {}.'.format(cm_imf.ReportDate.min())
      )
from datetime import datetime


def week_num(start_time, end_time):
    """Return the number of whole weeks between two dates.

    Parameters
    ----------
    start_time, end_time : datetime.datetime or pandas.Timestamp
        Any pair of objects whose subtraction yields a timedelta.

    Returns
    -------
    int
        Whole weeks elapsed from ``start_time`` to ``end_time``
        (negative if ``end_time`` precedes ``start_time``).

    Notes
    -----
    The previous implementation combined ``strftime('%W')`` week numbers
    with a hard-coded 52-weeks-per-year assumption, which miscounts across
    years containing 53 calendar weeks. Plain date arithmetic avoids both
    the fixed-length-year assumption and the string round-trip.
    """
    return (end_time - start_time).days // 7
# the gap in weeks between the two types' start dates equals the number of
# missing records; no other gaps exist, so the series has no internal discontinuity
week_num(data.loc[(data.Industry == 'Commodities/Materials') & (data.Type == 'IMF')].ReportDate.min(),  # start time
         data.loc[(data.Industry == 'Commodities/Materials') & (data.Type == 'RMF')].ReportDate.min(),  # end time
         )


def _plot_cm_flow(fund_type):
    # line plot of weekly Flow for Commodities/Materials in a single fund type
    subset = data.loc[(data.Industry == 'Commodities/Materials') & (data.Type == fund_type)]
    plt.plot(subset.ReportDate, subset.Flow, label=fund_type)
    plt.title('Flow Plot for Commodities/Materials in ' + fund_type + ' Type')
    plt.xlabel('Time Range')
    plt.ylabel('Flow Range')
    plt.show()


# RMF starts later than the other two types, and its range is smaller
_plot_cm_flow('RMF')
# IMF generally stays within -500 ~ 500, with a large ~-2000 outflow around May 2014
_plot_cm_flow('IMF')
# ETF closely resembles IMF in both trend and flow magnitude
_plot_cm_flow('ETF')

# flow amount for IMF
data.loc[(data.Industry == 'Commodities/Materials') & (data.Type == 'IMF')].Flow
# flow amount for ETF
data.loc[(data.Industry == 'Commodities/Materials') & (data.Type == 'ETF')].Flow
Consumer Goods:
# checking missing data for Consumer Goods
# similarly, the Consumer Goods industry also has fewer records of the RMF type
data.loc[data.Industry == 'Consumer Goods'].Type.value_counts()

# the missing RMF records sit at the beginning of the entire timeline
cg_rmf = data.loc[(data.Industry == 'Consumer Goods') & (data.Type == 'RMF')]
cg_imf = data.loc[(data.Industry == 'Consumer Goods') & (data.Type == 'IMF')]
print('The most recent ReportDate for RMF type Consumer Goods industry is {};'.format(cg_rmf.ReportDate.max()), '\n'
      'The least recent ReportDate for RMF type Consumer Goods industry is {}.'.format(cg_rmf.ReportDate.min()), '\n'
      'Compare with other typed while same industry funds, their most recent ReportDate is {};'.format(cg_imf.ReportDate.max()), '\n'
      'and their least recent ReportDate is {}.'.format(cg_imf.ReportDate.min())
      )

# the gap in weeks between the two types' start dates equals the number of
# missing records; no other gaps, hence no internal discontinuity
week_num(cg_imf.ReportDate.min(),  # start time
         cg_rmf.ReportDate.min(),  # end time
         )


def _plot_cg_flow(fund_type):
    # line plot of weekly Flow for Consumer Goods in a single fund type
    subset = data.loc[(data.Industry == 'Consumer Goods') & (data.Type == fund_type)]
    plt.plot(subset.ReportDate, subset.Flow, label=fund_type)
    plt.title('Flow Plot for Consumer Goods in ' + fund_type + ' Type')
    plt.xlabel('Time Range')
    plt.ylabel('Flow Range')
    plt.show()


# RMF starts later than the other two types and has a smaller Flow range
_plot_cg_flow('RMF')
# most flows stay within -800 ~ 1000, with an extreme drop at the beginning of 2014
_plot_cg_flow('IMF')
# the ETF series again closely resembles the IMF one
_plot_cg_flow('ETF')
Industrials:
# checking missing data for Industrials
# similarly, the Industrials industry also has fewer records of the RMF type
data.loc[data.Industry == 'Industrials'].Type.value_counts()

# the missing RMF records sit at the beginning of the entire timeline
ind_rmf = data.loc[(data.Industry == 'Industrials') & (data.Type == 'RMF')]
ind_imf = data.loc[(data.Industry == 'Industrials') & (data.Type == 'IMF')]
print('The most recent ReportDate for RMF type Industrials industry is {};'.format(ind_rmf.ReportDate.max()), '\n'
      'The least recent ReportDate for RMF type Industrials industry is {}.'.format(ind_rmf.ReportDate.min()), '\n'
      'Compare with other typed while same industry funds, their most recent ReportDate is {};'.format(ind_imf.ReportDate.max()), '\n'
      'and their least recent ReportDate is {}.'.format(ind_imf.ReportDate.min())
      )

# the gap in weeks between the two types' start dates equals the number of
# missing records; no other gaps, hence no internal discontinuity
week_num(ind_imf.ReportDate.min(),  # start time
         ind_rmf.ReportDate.min(),  # end time
         )


def _plot_ind_flow(fund_type):
    # line plot of weekly Flow for Industrials in a single fund type
    subset = data.loc[(data.Industry == 'Industrials') & (data.Type == fund_type)]
    plt.plot(subset.ReportDate, subset.Flow, label=fund_type)
    plt.title('Flow Plot for Industrials in ' + fund_type + ' Type')
    plt.xlabel('Time Range')
    plt.ylabel('Flow Range')
    plt.show()


# RMF starts much later than the other two types and has a smaller Flow range
_plot_ind_flow('RMF')
# most flows stay within -500 ~ 500, with an extreme drop in 2014 and an extreme rise in 2017
_plot_ind_flow('IMF')
# unlike previous industries, ETF is not similar to IMF here: the trend is
# unstable, with very frequent rises and drops across the timeline
_plot_ind_flow('ETF')
Infrastructure:
# checking missing data for Infrastructure
# this industry has too few records for meaningful visualization
data.loc[data.Industry == 'Infrastructure'].Type.value_counts()

# Infrastructure is missing RMF records at the beginning of the entire timeline.
# BUG FIX: the printed messages previously said "Industrials" although the data
# being summarized is the Infrastructure industry.
infra_rmf = data.loc[(data.Industry == 'Infrastructure') & (data.Type == 'RMF')]
infra_imf = data.loc[(data.Industry == 'Infrastructure') & (data.Type == 'IMF')]
infra_etf = data.loc[(data.Industry == 'Infrastructure') & (data.Type == 'ETF')]
print('The most recent ReportDate for RMF type Infrastructure industry is {};'.format(infra_rmf.ReportDate.max()), '\n'
      'The least recent ReportDate for RMF type Infrastructure industry is {}.'.format(infra_rmf.ReportDate.min()), '\n'
      'Compare with IMF type while same industry funds, their most recent ReportDate is {};'.format(infra_imf.ReportDate.max()), '\n'
      'and their least recent ReportDate is {}.'.format(infra_imf.ReportDate.min()), '\n'
      'Compare with ETF type while same industry funds, their most recent ReportDate is {};'.format(infra_etf.ReportDate.max()), '\n'
      'and their least recent ReportDate is {}.'.format(infra_etf.ReportDate.min())
      )

# Infrastructure plot for the available types:
# RMF has less than 1 year of records
plt.plot(infra_rmf.ReportDate, infra_rmf.Flow, label='RMF')
plt.title('Flow Plot for Infrastructure in RMF Type')
plt.xlabel('Time Range')
plt.ylabel('Flow Range')
plt.show()

# IMF has only 4 weekly records with gaps in the timeline,
# so a scatter plot is clearer than a line plot
plt.scatter(x=infra_imf.ReportDate, y=infra_imf.Flow, label='IMF')
plt.title('Flow Plot for Infrastructure in IMF Type')
plt.xlabel('Time Range')
plt.ylabel('Flow Range')
plt.show()
The moving average can remove the short-term fluctuations of the time series and make the data smooth, so that the trend characteristics of the series can be easily seen.
The charts below show AssetsEnd data for the selected industry ETF/IMF/RMF series, along with 30-week moving averages. Moving averages smooth out short-term volatility, making the long-term trend visible.
from matplotlib.pyplot import MultipleLocator

# short-window SMA demo on the Health Care/Biotech ETF assets series.
# FIX: .copy() makes the filtered frame an independent object; sorting/assigning
# on the raw slice triggers pandas' SettingWithCopyWarning and may not stick.
dataSMA = data[(data['Industry'] == 'Health Care/Biotech') & (data['Type'] == 'ETF')].copy()
dataSMA.sort_index(axis=0, ascending=False, inplace=True)
dataSMA['SMA_3'] = dataSMA['AssetsEnd'].rolling(window=3).mean()
dataSMA['SMA_5'] = dataSMA['AssetsEnd'].rolling(window=5).mean()
dataSMA = dataSMA[['ReportDate', 'AssetsEnd', 'SMA_3', 'SMA_5']]
print(dataSMA)
dataSMA.plot(kind='line', x='ReportDate', y=['AssetsEnd', 'SMA_3', 'SMA_5'],
             color=['lightsteelblue', 'cornflowerblue', 'royalblue'])
plt.legend()
plt.xlabel('year')
plt.ylabel('moving average')
plt.title('Simple Moving Average of the Health Care/Biotech Industry ETF Stocks')
y_major_locator = MultipleLocator(10000)
ax = plt.gca()
ax.yaxis.set_major_locator(y_major_locator)
plt.ylim(0, 80000)
plt.show()
list_indus = ['Financials', 'Energy', 'Health Care/Biotech', 'Technology']


def SMA(AssetsEnd, n):
    """Simple moving average over an n-week window."""
    return AssetsEnd.rolling(window=n).mean()


def WMA(AssetsEnd, n):
    """Linearly weighted moving average (weight n for the newest value)."""
    return AssetsEnd.rolling(window=n).apply(lambda x: x[::-1].cumsum().sum() * 2 / n / (n + 1))


def EMA(AssetsEnd, n):
    """Exponential moving average with span n."""
    return AssetsEnd.ewm(span=n, min_periods=n).mean()


# 30-week SMA/WMA/EMA of AssetsEnd for each selected industry's ETF series.
# FIX: the three helper functions were re-defined on every loop iteration;
# they are loop-invariant and now defined once. .copy() avoids pandas'
# SettingWithCopyWarning on the filtered slice.
for i in list_indus:
    dataMA1 = data[(data['Industry'] == i) & (data['Type'] == 'ETF')].copy()
    dataMA1.sort_index(axis=0, ascending=False, inplace=True)
    dataMA1['SMA_30'] = SMA(dataMA1['AssetsEnd'], 30)
    dataMA1['WMA_30'] = WMA(dataMA1['AssetsEnd'], 30)
    dataMA1['EMA_30'] = EMA(dataMA1['AssetsEnd'], 30)
    dataMA1 = dataMA1[['ReportDate', 'AssetsEnd', 'SMA_30', 'WMA_30', 'EMA_30']]
    dataMA1.plot(kind='line', x='ReportDate', y=['AssetsEnd', 'SMA_30', 'WMA_30', 'EMA_30'],
                 figsize=(16, 6), color=['lightsteelblue', 'lightblue', 'cornflowerblue', 'royalblue'])
    plt.legend()
    plt.xlabel('year')
    plt.ylabel('moving average')
    plt.title('Moving Average of the ' + i + ' Industry ETF Stocks')
    ax = plt.gca()
    ax.yaxis.set_major_locator(MultipleLocator(10000))
    plt.ylim(0, 80000)
    plt.show()
def SMA(AssetsEnd, n):
    """Simple moving average over an n-week window."""
    return AssetsEnd.rolling(window=n).mean()


def WMA(AssetsEnd, n):
    """Linearly weighted moving average (weight n for the newest value)."""
    return AssetsEnd.rolling(window=n).apply(lambda x: x[::-1].cumsum().sum() * 2 / n / (n + 1))


def EMA(AssetsEnd, n):
    """Exponential moving average with span n."""
    return AssetsEnd.ewm(span=n, min_periods=n).mean()


# same 30-week moving averages, now for the IMF series of each industry.
# FIX: helper definitions hoisted out of the loop; .copy() avoids
# SettingWithCopyWarning on the filtered slice.
for i in list_indus:
    dataMA2 = data[(data['Industry'] == i) & (data['Type'] == 'IMF')].copy()
    dataMA2.sort_index(axis=0, ascending=False, inplace=True)
    dataMA2['SMA_30'] = SMA(dataMA2['AssetsEnd'], 30)
    dataMA2['WMA_30'] = WMA(dataMA2['AssetsEnd'], 30)
    dataMA2['EMA_30'] = EMA(dataMA2['AssetsEnd'], 30)
    dataMA2 = dataMA2[['ReportDate', 'AssetsEnd', 'SMA_30', 'WMA_30', 'EMA_30']]
    dataMA2.plot(kind='line', x='ReportDate', y=['AssetsEnd', 'SMA_30', 'WMA_30', 'EMA_30'],
                 figsize=(16, 6), color=['lightsteelblue', 'lightblue', 'cornflowerblue', 'royalblue'])
    plt.legend()
    plt.xlabel('year')
    plt.ylabel('moving average')
    plt.title('Moving Average of the ' + i + ' Industry IMF Stocks')
    ax = plt.gca()
    ax.yaxis.set_major_locator(MultipleLocator(10000))
    plt.ylim(0, 80000)
    plt.show()
def SMA(AssetsEnd, n):
    """Simple moving average over an n-week window."""
    return AssetsEnd.rolling(window=n).mean()


def WMA(AssetsEnd, n):
    """Linearly weighted moving average (weight n for the newest value)."""
    return AssetsEnd.rolling(window=n).apply(lambda x: x[::-1].cumsum().sum() * 2 / n / (n + 1))


def EMA(AssetsEnd, n):
    """Exponential moving average with span n."""
    return AssetsEnd.ewm(span=n, min_periods=n).mean()


# same 30-week moving averages, now for the RMF series of each industry.
# FIX: helper definitions hoisted out of the loop; .copy() avoids
# SettingWithCopyWarning on the filtered slice.
for i in list_indus:
    dataMA3 = data[(data['Industry'] == i) & (data['Type'] == 'RMF')].copy()
    dataMA3.sort_index(axis=0, ascending=False, inplace=True)
    dataMA3['SMA_30'] = SMA(dataMA3['AssetsEnd'], 30)
    dataMA3['WMA_30'] = WMA(dataMA3['AssetsEnd'], 30)
    dataMA3['EMA_30'] = EMA(dataMA3['AssetsEnd'], 30)
    dataMA3 = dataMA3[['ReportDate', 'AssetsEnd', 'SMA_30', 'WMA_30', 'EMA_30']]
    dataMA3.plot(kind='line', x='ReportDate', y=['AssetsEnd', 'SMA_30', 'WMA_30', 'EMA_30'],
                 figsize=(16, 6), color=['lightsteelblue', 'lightblue', 'cornflowerblue', 'royalblue'])
    plt.legend()
    plt.xlabel('year')
    plt.ylabel('moving average')
    plt.title('Moving Average of the ' + i + ' Industry RMF Stocks')
    ax = plt.gca()
    ax.yaxis.set_major_locator(MultipleLocator(10000))
    plt.ylim(0, 80000)
    plt.show()
For continuously compounded returns, the annual cumulative return is obtained by summing the weekly log returns within each year.
# back out start-of-week assets from end-of-week assets and net flow
data['AssetsStart'] = data['AssetsEnd'] - data['Flow']
data.sample(3)

list_year = [2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]


def _yearly_log_return(year, industry, fund_type):
    """Cumulative log return of AssetsEnd for one industry/type/year."""
    sub = data[(data['year'] == year) & (data['Industry'] == industry) & (data['Type'] == fund_type)].copy()
    sub.sort_index(axis=0, ascending=False, inplace=True)
    # sum of weekly log-price differences == log of the yearly gross return
    sub = sub.loc[:, ['ReportDate', 'AssetsStart', 'AssetsEnd']]
    sub['log_price'] = np.log(sub['AssetsEnd'])
    sub['log_return'] = sub.log_price.diff()
    return sub.log_return.sum()


# FIX: the inner helper was named funRMF although this loop plots the ETF
# series, and it was re-defined on every iteration; it is now a shared
# parameterized function. The y-axis label typo 'year_retunHB' is corrected.
for i in list_indus:
    year_returns = [_yearly_log_return(y, i, 'ETF') for y in list_year]
    plt.plot(list_year, year_returns, color='cornflowerblue', label='year return')
    plt.legend()
    plt.xlabel('year')
    plt.ylabel('yearly log return')
    plt.title('Year Returns for the ' + i + ' Industry ETF stocks')
    plt.axhline(y=0, color='grey', linestyle='--')
    plt.ylim(-1, 2)
    ax = plt.gca()
    ax.yaxis.set_major_locator(MultipleLocator(0.2))
    plt.show()
list_year = [2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]


def _yearly_log_return(year, industry, fund_type):
    """Cumulative log return of AssetsEnd for one industry/type/year."""
    sub = data[(data['year'] == year) & (data['Industry'] == industry) & (data['Type'] == fund_type)].copy()
    sub.sort_index(axis=0, ascending=False, inplace=True)
    sub = sub.loc[:, ['ReportDate', 'AssetsStart', 'AssetsEnd']]
    sub['log_price'] = np.log(sub['AssetsEnd'])
    sub['log_return'] = sub.log_price.diff()
    return sub.log_return.sum()


# FIX: helper hoisted out of the loop and parameterized by fund type
# (original funRMF was re-defined per industry); ylabel typo corrected.
for i in list_indus:
    year_returns = [_yearly_log_return(y, i, 'IMF') for y in list_year]
    plt.plot(list_year, year_returns, color='cornflowerblue', label='year return')
    plt.legend()
    plt.xlabel('year')
    plt.ylabel('yearly log return')
    plt.title('Year Returns for the ' + i + ' Industry IMF stocks')
    plt.axhline(y=0, color='grey', linestyle='--')
    plt.ylim(-1, 2)
    ax = plt.gca()
    ax.yaxis.set_major_locator(MultipleLocator(0.2))
    plt.show()
list_year = [2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]


def _yearly_log_return(year, industry, fund_type):
    """Cumulative log return of AssetsEnd for one industry/type/year."""
    sub = data[(data['year'] == year) & (data['Industry'] == industry) & (data['Type'] == fund_type)].copy()
    sub.sort_index(axis=0, ascending=False, inplace=True)
    sub = sub.loc[:, ['ReportDate', 'AssetsStart', 'AssetsEnd']]
    sub['log_price'] = np.log(sub['AssetsEnd'])
    sub['log_return'] = sub.log_price.diff()
    return sub.log_return.sum()


# FIX: helper hoisted out of the loop and parameterized by fund type;
# ylabel typo 'year_retunHB' corrected.
for i in list_indus:
    year_returns = [_yearly_log_return(y, i, 'RMF') for y in list_year]
    plt.plot(list_year, year_returns, color='cornflowerblue', label='year return')
    plt.legend()
    plt.xlabel('year')
    plt.ylabel('yearly log return')
    plt.title('Year Returns for the ' + i + ' Industry RMF stocks')
    plt.axhline(y=0, color='grey', linestyle='--')
    plt.ylim(-1, 2)
    ax = plt.gca()
    ax.yaxis.set_major_locator(MultipleLocator(0.2))
    plt.show()
Local Optimization
Here we fit 3 model families (MA, SES & LSTM) for each of the 48 unique combinations of fund type and fund industry, aiming to minimize MSE and reach a relatively high accuracy in predicting the tradable signal in terms of flow amount.
Among these models, the MA and SES estimations are simple to understand, fast to execute, and return relatively accurate prediction results.
# drop every row belonging to the Infrastructure industry (too few observations)
df = data.drop(data[data['Industry'] == 'Infrastructure'].index)
data.Type.unique()
data.Industry.unique()

# industries retained for modelling: those without missing observations
industries = [
    'Energy', 'Financials', 'Health Care/Biotech', 'Large Cap Blend',
    'Large Cap Growth', 'Large Cap Value', 'Mid Cap Blend', 'Mid Cap Growth',
    'Mid Cap Value', 'Real Estate', 'Small Cap Blend', 'Small Cap Growth',
    'Small Cap Value', 'Technology', 'Telecom', 'Utilities',
]
len(industries)
All subsets of flow (across the different types and industries) are tested to be stationary.
# Augmented Dickey-Fuller test
# H0: the series is non-stationary; H1: the series is stationary
def adfuller_test(flow):
    """Run an ADF test on a flow series, print its statistics and a verdict."""
    result = adfuller(flow)
    labels = ['ADF Test Statistic', 'p-value', '#Lags Used', 'Number of Observations']
    for value, label in zip(result, labels):
        print(label + ' : ' + str(value))
    if result[1] <= 0.05:
        print("Strong evidence against the null hypothesis(Ho), reject the null hypothesis. Data is stationary.")
    else:
        print("weak evidence against null hypothesis,indicating it is non-stationary.")


# every Type x Industry flow sub-series turns out to be stationary
for t in data.Type.unique():
    for i in industries:
        print('Stationary check for type {} in {} industry:'.format(t, i))
        temp = data.loc[(data.Type == t) & (data.Industry == i)][['ReportDate', 'Flow']]
        adfuller_test(temp['Flow'])
        print('-' * 50)
from sklearn.metrics import mean_squared_error

# grid-search the moving-average window (1..20) for every Type x Industry pair.
# FIX: pandas.DataFrame.append was removed in pandas 2.0 — results are
# collected in a list of dicts and converted to a DataFrame once at the end
# (also much faster than appending row by row).
ma_records = []
for t in data.Type.unique():
    # iterating over the 16 retained industries
    for i in industries:
        sub = data.loc[(data.Type == t) & (data.Industry == i)][['ReportDate', 'Flow']]
        for n in range(1, 21):
            # no burn-in period assumed; predict the final observations
            # (the rolling mean is shifted one step to act as a forecast)
            mse = mean_squared_error(sub["Flow"][550:], sub["Flow"].rolling(n).mean()[549:578])
            ma_records.append({'Type': t, 'Industry': i, 'MA window': n, 'MA_mse': mse})
df_ma = pd.DataFrame(ma_records)

# within each Type/Industry group, order windows from best to worst mse
df_ma = df_ma.groupby(['Type', 'Industry']).apply(lambda x: x.sort_values(["MA_mse"], ascending=True)).reset_index(drop=True)

# keep the best (first) row of each 20-row group: 48 groups x 20 windows = 960 rows
df_ma_final = df_ma.iloc[range(0, 960, 20)].copy()
# reset index, then drop the carried-over old index column
df_ma_final.reset_index(inplace=True)
df_ma_final = df_ma_final.iloc[:, 1:]
# the window size is an integer
df_ma_final['MA window'] = df_ma_final['MA window'].astype('int')
df_ma_final
# plot train/test flow together with the best MA window found for each pair
for i in range(df_ma_final.shape[0]):
    row = df_ma_final.iloc[i]
    # sub-series for this Type/Industry pair, in chronological order
    sub = data.loc[(data.Type == row.Type) & (data.Industry == row.Industry)][['ReportDate', 'Flow']]
    sub.sort_values('ReportDate', ascending=True, inplace=True)
    # train/test split at observation 550
    train = sub.iloc[:551]
    test = sub.iloc[550:]
    # forecast the test span with the selected window
    pred_ma = test.Flow.rolling(row['MA window'], min_periods=1).mean()
    plt.figure(figsize=(40, 3))
    plt.plot(train.ReportDate, train.Flow, c='b', label='Train')
    plt.plot(test.ReportDate, test.Flow, c='orange', label='Test')
    plt.plot(test.ReportDate, pred_ma, c='grey', label="MA window" + str(row['MA window']))
    plt.title('Moving Average with Window = {} for {} of {} Type'.format(row['MA window'], row['Industry'], row['Type']))
    plt.xlabel('Report Date')
    plt.ylabel('Flow')
    plt.legend()
    plt.show()
# if needed to store output
# df_ma_final.to_csv('moving_avg_output.csv')
# grid-search the SES smoothing level alpha in 0.1..1.0 per Type x Industry.
# FIX: pandas.DataFrame.append was removed in pandas 2.0 — collect rows in a
# list of dicts and build the DataFrame once.
ses_records = []
for t in data.Type.unique():
    # iterating over the 16 retained industries
    for i in industries:
        sub = data.loc[(data.Type == t) & (data.Industry == i)][['ReportDate', 'Flow']]
        for alpha in np.arange(0.1, 1.1, 0.1):
            # mse of the SES forecast over the held-out tail
            mse = mean_squared_error(sub.Flow[550:],
                                     ExponentialSmoothing(sub.Flow).fit(smoothing_level=alpha).predict(550,))
            ses_records.append({'Type': t, 'Industry': i, 'SES alpha': alpha, 'SES mse': mse})
df_ses = pd.DataFrame(ses_records)
df_ses

# within each Type/Industry group, order alphas from best to worst mse
df_ses = df_ses.groupby(['Type', 'Industry']).apply(lambda x: x.sort_values(["SES mse"], ascending=True)).reset_index(drop=True)

# keep the best (first) row of each 10-row group: 48 groups x 10 alphas = 480 rows
df_ses_final = df_ses.iloc[range(0, 480, 10)].copy()
df_ses_final.reset_index(inplace=True)
df_ses_final = df_ses_final.iloc[:, 1:]
df_ses_final
# plot train/test flow together with the best SES smoothing factor per pair
for i in range(df_ses_final.shape[0]):
    row = df_ses_final.iloc[i]
    # sub-series for this Type/Industry pair, in chronological order
    sub = data.loc[(data.Type == row.Type) & (data.Industry == row.Industry)][['ReportDate', 'Flow']]
    sub.sort_values('ReportDate', ascending=True, inplace=True)
    # train/test split at observation 550
    train = sub.iloc[:551]
    test = sub.iloc[550:]
    # refit SES with the selected smoothing level and forecast the tail
    pred_ses = ExponentialSmoothing(sub.Flow).fit(smoothing_level=row['SES alpha']).predict(550,)
    plt.figure(figsize=(40, 3))
    plt.plot(train.ReportDate, train.Flow, c='b', label='Train')
    plt.plot(test.ReportDate, test.Flow, c='orange', label='Test')
    plt.plot(test.ReportDate, pred_ses, c='grey', label="SES smoothing factor" + str(row['SES alpha']))
    plt.title('Simple Exponential Smoothing with alpha = {} for {} of {} Type'.format(row['SES alpha'], row['Industry'], row['Type']))
    plt.xlabel('Report Date')
    plt.ylabel('Flow')
    plt.legend()
    plt.show()
# if needed to store SES result
# df_ses_final.to_csv('ses_output.csv')
# compare two model results: moving average (MA) vs. SES, per (Type, Industry)
comparision = df_ma_final.copy()
# NOTE(review): this assignment is positional — it relies on df_ma_final and
# df_ses_final sharing the exact same row order; confirm both grids were built
# with identical group ordering.
comparision[['SES alpha', 'SES_mse']] = df_ses_final[['SES alpha', 'SES mse']]
# comparision.drop(columns='SES mse', inplace=True)
comparision
# label whichever model achieved the lower hold-out mse
comparision['better model'] = np.where(comparision['MA_mse'] > comparision['SES_mse'], 'SES', 'MA')
comparision
# overall MA models generate a better performance in predicting short-term flow change
comparision['better model'].value_counts()
# plot
comparision['better model'].value_counts().plot(kind = 'bar')
plt.title('Assess MA* vs. SES*')
plt.show()
# acf & pacf plot for every (Type, Industry) series — used to eyeball candidate
# AR (pacf) and MA (acf) orders before the ARMA grid search below
# iterating for 3 types
for t in data.Type.unique():
    # iterating for 16 industries
    for i in industries:
        print('With fund flow in {} Industry & {} Type:'.format(i, t))
        sub = data.loc[(data.Type == t) & (data.Industry == i)][['ReportDate', 'Flow']]
        fig, ax = plt.subplots(2, figsize=(12,6))
        ax[0] = plot_acf(sub.Flow, ax=ax[0], lags=20)
        ax[1] = plot_pacf(sub.Flow, ax=ax[1], lags=20)
        plt.xlabel('Lag')
        plt.show()
        print('-'*50)
df_arma = pd.DataFrame()
## due to stats lib updates, ARMA is not available
from statsmodels.tsa.arima_model import ARIMA
# iterating for 3 types
for t in data.Type.unique():
# iterating for 16 industries
for i in industries:
# subset
sub = data.loc[(data.Type == t) & (data.Industry == i)][['ReportDate', 'Flow']]
# train & test split
train = sub.Flow[:551]
test = sub.Flow[550:]
# ar param
for p in range(4):
# ma param
for q in range(1, 4):
# in case model order not applicable
try:
# initialize model
model = ARIMA(train, order = (p, 0, q))
# fit
model_fit = model.fit()
# model performance = aic + bic, the smaller the better
res = model_fit.aic + model_fit.bic
df_arma = df_arma.append({'Type': t,
'Industry': i,
'order': (p, q),
'performance': res}, ignore_index = True)
except:
continue
# print('With window = {}, the corresponding mse in {} Industry with {} Type is {}.'.format(n, i, t, mse))
# groupby mse values by Type and Industries, and sort values by mse
df_arma = df_arma.groupby(['Type','Industry']).apply(lambda x: x.sort_values(["performance"], ascending = True)).reset_index(drop=True)
# store best result: after the per-group sort above, the first row of each
# (Type, Industry) group carries the lowest AIC+BIC order
df_arma_final = pd.DataFrame()
# iterating for 3 types
for t in data.Type.unique():
    # iterating for 16 industries
    for i in industries:
        # best (lowest performance score) order for this pair
        best = df_arma.loc[(df_arma.Type == t) & (df_arma.Industry == i)].iloc[0]
        df_arma_final = df_arma_final.append(best)
df_arma_final
# Visualizing: refit the best ARMA order per series and plot forecast vs. test.
for row in range(df_arma_final.shape[0]):
    # extract column factors
    industry = df_arma_final.iloc[row].Industry
    ftype = df_arma_final.iloc[row].Type
    order = list(df_arma_final.iloc[row].order)
    # subset data
    sub = data.loc[(data.Type == ftype) & (data.Industry == industry)][['ReportDate', 'Flow']]
    # reorder sequence chronologically
    sub.sort_values('ReportDate', ascending = True, inplace=True)
    # train & test split
    train = sub[:551]
    test = sub[550:]
    # initialize with the grid-searched (p, q); d fixed at 0 (plain ARMA)
    model = ARIMA(train.Flow, order = (order[0], 0, order[1]))
    # fit; skip series where the chosen order fails to converge
    try:
        model_fit = model.fit()
        # predict over the hold-out window (positions 550..578)
        predict = model_fit.predict(start = 550, end = 578)
        # plot
        plt.figure(figsize= (40, 3))
        plt.plot(train.ReportDate, train.Flow, c = 'b', label = 'Train')
        plt.plot(test.ReportDate, test.Flow, c='orange', label = 'Test')
        plt.plot(test.ReportDate, predict, c='grey', label = order)
        plt.title('ARMA with order = {} for {} of {} Type'.format(order, industry, ftype))
        plt.xlabel('Report Date')
        plt.ylabel('Flow')
        plt.legend()
        plt.show()
    # FIX: was a bare `except:`; narrowed so Ctrl-C still interrupts the loop
    except Exception:
        continue
# print one subset mse for presentation comparison
# subset: ETF Energy
temp = data.loc[(data.Type == 'ETF') & (data.Industry == 'Energy')][['ReportDate', 'Flow']]
t_train = temp[:551]
t_test = temp[550:]
model = ARIMA(t_train.Flow, order = (0,0,1))
# BUG FIX: the original constructed `model` but never fitted it, then called
# .predict() on the stale `model_fit` left over from the visualization loop —
# the reported MSE belonged to whichever series was fitted last. Fit this
# model before predicting.
model_fit = model.fit()
predict = model_fit.predict(start = 550, end = 578)
mean_squared_error(t_test.Flow, predict)
# LSTM experiment on the ETF / Energy series: build the feature matrix,
# scale it, split 550 rows for training, and convert to torch tensors.
LSTM_ETF=data[data['Type']== 'ETF']
LSTM_ETF.head(3)
LSTM_ETF_energy=LSTM_ETF[LSTM_ETF['Industry']=='Energy']
LSTM_ETF_energy.head(3)
# chronological order is required for a time-series split
LSTM_ETF_energy = LSTM_ETF_energy.sort_values(by='ReportDate', ascending=True)
LSTM_ETF_energy.head(3)
# features: FlowPct, AssetsEnd, PortfolioChangePct; target: Flow (last column)
input_data=LSTM_ETF_energy[['FlowPct','AssetsEnd','PortfolioChangePct','Flow']]
input_data=input_data.reset_index(drop=True)
input_data.head(3)
X = input_data.iloc[:, :-1]
y = input_data.iloc[:, 3:4]
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# standardize the features; min-max scale the target (inverted for plotting later)
mm = MinMaxScaler()
ss = StandardScaler()
# NOTE(review): scalers are fit on the FULL series (train + test) — mild
# look-ahead leakage; consider fitting on the first 550 rows only.
X_ss = ss.fit_transform(X)
y_mm = mm.fit_transform(y)
#first 550 for training
X_train = X_ss[:550, :]
X_test = X_ss[550:, :]
y_train = y_mm[:550, :]
y_test = y_mm[550:, :]
print("Training Shape", X_train.shape, y_train.shape)
print("Testing Shape", X_test.shape, y_test.shape)
# Variable is a no-op wrapper on modern torch, kept for compatibility
X_train_tensors = Variable(torch.Tensor(X_train))
X_test_tensors = Variable(torch.Tensor(X_test))
y_train_tensors = Variable(torch.Tensor(y_train))
y_test_tensors = Variable(torch.Tensor(y_test))
#reshaping to rows, timestamps, features — each row becomes a length-1 sequence
X_train_tensors_final = torch.reshape(X_train_tensors, (X_train_tensors.shape[0], 1, X_train_tensors.shape[1]))
X_test_tensors_final = torch.reshape(X_test_tensors, (X_test_tensors.shape[0], 1, X_test_tensors.shape[1]))
print("Training Shape", X_train_tensors_final.shape, y_train_tensors.shape)
print("Testing Shape", X_test_tensors_final.shape, y_test_tensors.shape)
class LSTM1(nn.Module):
    """LSTM regressor head: LSTM -> ReLU -> Linear(128) -> ReLU -> Linear(out)."""

    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_length):
        super(LSTM1, self).__init__()
        # keep the hyper-parameters around for forward() and introspection
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        # batch_first so inputs are shaped (batch, seq, features)
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc_1 = nn.Linear(hidden_size, 128)  # hidden state -> 128
        self.fc = nn.Linear(128, num_classes)    # 128 -> output
        self.relu = nn.ReLU()

    def forward(self, x):
        batch = x.size(0)
        # fresh zero-initialized hidden and cell states on every call
        h_0 = Variable(torch.zeros(self.num_layers, batch, self.hidden_size))
        c_0 = Variable(torch.zeros(self.num_layers, batch, self.hidden_size))
        # run the sequence; only the final hidden state feeds the dense head
        _, (final_hidden, _) = self.lstm(x, (h_0, c_0))
        # (layers, batch, hidden) -> (layers*batch, hidden) for the Linear layers
        flat = final_hidden.view(-1, self.hidden_size)
        return self.fc(self.relu(self.fc_1(self.relu(flat))))
# training hyper-parameters
num_epochs = 1000 #1000 epochs
learning_rate = 0.001 #0.001 lr
input_size = 3 #number of features
hidden_size = 2 #number of features in hidden state
num_layers = 1 #number of stacked lstm layers
num_classes = 1 #single regression output
lstm1 = LSTM1(num_classes, input_size, hidden_size, num_layers, X_train_tensors_final.shape[1]) #our lstm class
criterion = torch.nn.MSELoss() # mean-squared error for regression
optimizer = torch.optim.Adam(lstm1.parameters(), lr=learning_rate)
# full-batch gradient descent: the entire training tensor every epoch
for epoch in range(num_epochs):
    outputs = lstm1.forward(X_train_tensors_final) #forward pass
    optimizer.zero_grad() #clear accumulated gradients before backprop
    # obtain the loss function
    loss = criterion(outputs, y_train_tensors)
    loss.backward() #compute gradients of the loss w.r.t. parameters
    optimizer.step() #apply the parameter update
    if epoch % 100 == 0:
        print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))
# Forward the whole series through the trained LSTM and plot predicted vs. actual.
df_X_ss = ss.transform(input_data.iloc[:, :-1]) #reuse the already-fitted scalers
df_y_mm = mm.transform(input_data.iloc[:, -1:])
df_X_ss = Variable(torch.Tensor(df_X_ss)) #converting to Tensors
df_y_mm = Variable(torch.Tensor(df_y_mm))
#reshaping the dataset to (rows, timesteps=1, features)
df_X_ss = torch.reshape(df_X_ss, (df_X_ss.shape[0], 1, df_X_ss.shape[1]))
train_predict = lstm1(df_X_ss)#forward pass
data_predict = train_predict.data.numpy() #numpy conversion
dataY_plot = df_y_mm.data.numpy()
# undo the MinMax scaling so the plot is in original Flow units
data_predict = mm.inverse_transform(data_predict)
dataY_plot = mm.inverse_transform(dataY_plot)
plt.figure(figsize=(40,3)) #plotting
plt.plot(dataY_plot, label='Actuall Data', c = 'b') #actual plot
# only the hold-out portion (index >= 550) of the prediction is drawn
plt.plot(range(550, 550+len(data_predict[550:])),data_predict[550:], label='Predicted Data',c='orange') #predicted plot
plt.title('LSTM model for Energy of ETF Type')
plt.xlabel('Report Date')
plt.ylabel('Flow')
plt.legend()
plt.show()
# --- Second pipeline: reload the raw datasets for the classification models ---
# ETF (Exchange Traded Funds - institutional investors)
ETF = pd.read_csv('https://raw.githubusercontent.com/yinghaow525/MSBA-Projects/main/Capstone%20Project/Datasets/US%20Sector%20Inst%20ETF.csv')
# Institutional Mutual Fund Holdings
IMF = pd.read_csv('https://raw.githubusercontent.com/yinghaow525/MSBA-Projects/main/Capstone%20Project/Datasets/US%20Sector%20Institutional%20MF.csv')
# Retail Mutual Fund Holdings (investments made by individuals in their portfolios)
RMF = pd.read_csv('https://raw.githubusercontent.com/yinghaow525/MSBA-Projects/main/Capstone%20Project/Datasets/US%20Sector%20Retail%20MF.csv')
ETF.head(3)
# Create type column so the three sources remain distinguishable after stacking
ETF['Type'] = 'ETF'
IMF['Type'] = 'IMF'
RMF['Type'] = 'RMF'
# Concat the 3 datasets row-wise
data = pd.concat([ETF, IMF, RMF], axis=0)
# convert report date to pandas datetime object
data['ReportDate'] = pd.to_datetime(data['ReportDate'])
# since all the Asset and Sector values are the same (Equity, and within US)
# we only keep Industry sector as features
data['AssetClass'] = data['AssetClass'].str.split('-')
# first '-'-separated token of AssetClass is the industry name
data['Industry'] = data.AssetClass.apply(lambda x: x[0])
# drop original AssetClass feature
data.drop(columns='AssetClass', inplace=True)
# types and number of Industry values
data.Industry.value_counts()
data.head(3)
data = data.sort_values(by=['ReportDate'])
# extract year & month
data['year'], data['month'] = data['ReportDate'].dt.year, data['ReportDate'].dt.month
data.head(12)
data.head(3)
data.shape
data = data.sort_values(by=['Type', 'Industry','year','month','ReportDate'])
data = data.reset_index(drop=True)
# drop rows matching the conditions below, modifying data in place
# (presumably incomplete months / sparse groups, so that each
# (Type, Industry, year, month) group has a full set of weekly rows — verify)
data.drop(data[(data.year == 2017) & (data.month == 2)].index, inplace=True)
data.drop(data[data.Industry == 'Infrastructure'].index, inplace=True)
data.drop(data[(data.year == 2006) & (data.month == 9) & (data.Industry == 'Commodities/Materials') & (data.Type == 'RMF')].index, inplace=True)
data.drop(data[(data.year == 2006) & (data.month == 9) & (data.Industry == 'Consumer Goods') & (data.Type == 'RMF')].index, inplace=True)
data.drop(data[(data.year == 2011) & (data.month == 10) & (data.Industry == 'Industrials') & (data.Type == 'RMF')].index, inplace=True)
data.shape
def converter(x):
    """Collapse one (Industry, Type, year, month) group of weekly rows into a
    single monthly record.

    Flow is the sum of the distinct weekly flows; the *1 fields come from the
    group's first row, the *4 fields from the last distinct value observed.
    """
    first = x.iloc[0]
    return pd.Series({
        'Flow': x.Flow.unique().sum(),
        'FlowPct1': first['FlowPct'],
        'FlowPct4': x.FlowPct.unique()[-1],
        'AssetsEnd1': first['AssetsEnd'],
        'AssetsEnd4': x.AssetsEnd.unique()[-1],
        'PortfolioChangePct1': first['PortfolioChangePct'],
        'PortfolioChangePct4': x.PortfolioChangePct.unique()[-1],
        'Type': first['Type'],
        'Industry': first['Industry'],
        'year': first['year'],
        'month': first['month'],
    })
# (an earlier draft of converter also extracted the 2nd/3rd weekly observations
# here; those columns are rebuilt later from a string-joined aggregation instead)
# Aggregate weekly rows to one record per (Industry, Type, year, month), then
# shift Flow up one row so each record carries the NEXT month's flow as target.
group_filter = ['Industry','Type' , 'year', 'month']
data2=data.groupby(group_filter).apply(converter)
data2['Flow'] = data2['Flow'].shift(-1)
data2.rename(columns = {'Flow':'NextMonthFlow'}, inplace = True)
data2
# Rebuild the 2nd/3rd weekly observations: join each group's values into a
# comma-separated string, then split positionally.
d = data.groupby(group_filter,as_index=False,sort=False).aggregate(lambda x: ','.join(map(str, x)))
# sanity check: flag any group with fewer than 4 weekly values
for i in range(len(d)):
    if len(d.FlowPct[i].split(',')) < 4:
        print(i)
d.FlowPct[135]
FlowPct2 = []
FlowPct3 = []
AssetsEnd2 = []
AssetsEnd3 = []
PortfolioChangePct2 = []
PortfolioChangePct3 = []
for i in range(len(d)):
    FlowPct2.append(float(d.FlowPct[i].split(',')[1]))
    FlowPct3.append(float(d.FlowPct[i].split(',')[2]))
    AssetsEnd2.append(float(d.AssetsEnd[i].split(',')[1]))
    AssetsEnd3.append(float(d.AssetsEnd[i].split(',')[2]))
    PortfolioChangePct2.append(float(d.PortfolioChangePct[i].split(',')[1]))
    PortfolioChangePct3.append(float(d.PortfolioChangePct[i].split(',')[2]))
# NOTE(review): the list assignment below is positional — it assumes d's group
# order (groupby(sort=False)) matches data2's (sorted groupby); verify alignment.
data2['FlowPct2'] = FlowPct2
data2['FlowPct3'] = FlowPct3
data2['AssetsEnd2'] = AssetsEnd2
data2['AssetsEnd3'] = AssetsEnd3
data2['PortfolioChangePct2'] = PortfolioChangePct2
data2['PortfolioChangePct3'] = PortfolioChangePct3
data2
# reorder columns: 12 numeric features, categoricals/date parts, then the target
data2 = data2[['FlowPct1','FlowPct2','FlowPct3','FlowPct4','AssetsEnd1','AssetsEnd2','AssetsEnd3','AssetsEnd4','PortfolioChangePct1','PortfolioChangePct2','PortfolioChangePct3','PortfolioChangePct4','Type','Industry','year','month','NextMonthFlow']]
data2
# binary flag: did assets grow within the month (week 4 vs. week 1)?
data2['Inflow'] = np.where((data2.AssetsEnd4 - data2.AssetsEnd1) > 0, int(1), int(0))
# the last row's NextMonthFlow is NaN after the shift — drop it
data2.drop(data2.tail(1).index,inplace=True)
# Check the correlation of the features
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(15,10))
corr = data2.corr()
sns.heatmap(corr,linewidths=.2, cmap="YlGnBu",
            annot=True,
            fmt=".2f",
            annot_kws={'size':8,'weight':'normal', 'color':'#253D24'})
import matplotlib.pyplot as plt
# one-hot encode the categorical columns (Type, Industry)
data2 = pd.get_dummies(data2)
data2 = data2.reset_index(drop=True)
data2
# Check outliers
# Plot the boxplot of ratio features which are within the same scale
import matplotlib.pyplot as plt
plt.figure(figsize=(15,10))
data2.boxplot(column= [i for i in data2[['FlowPct1', 'FlowPct2','FlowPct3','FlowPct4',
                                         'PortfolioChangePct1', 'PortfolioChangePct2', 'PortfolioChangePct3', 'PortfolioChangePct4']].columns.tolist()]);
# plt.ylim([-50, 100])
# 'AssetsEnd1', 'AssetsEnd4',
# Winsorize the ratio variables at the 1% and 99% tails to tame outliers.
from scipy.stats.mstats import winsorize
# DRY: the original repeated the identical call eight times — loop instead.
for _col in ['FlowPct1', 'FlowPct2', 'FlowPct3', 'FlowPct4',
             'PortfolioChangePct1', 'PortfolioChangePct2',
             'PortfolioChangePct3', 'PortfolioChangePct4']:
    data2[_col] = winsorize(data2[_col], limits = [0.01, 0.01])
# Check outliers
# Plot the boxplot of the AssetsEnd level features, which share a scale
import matplotlib.pyplot as plt
plt.figure(figsize=(10,6))
data2.boxplot(column= [i for i in data2[['AssetsEnd1', 'AssetsEnd2', 'AssetsEnd3', 'AssetsEnd4']].columns.tolist()]);
# plt.ylim([-50, 100])
# There are many outliers and the data is right-skewed.
plt.figure(figsize=(10,6))
sns.distplot(data2['AssetsEnd1'], bins = 15, kde = False)
fig = sns.distplot(data2['AssetsEnd2'], bins = 15, kde = False)
fig = sns.distplot(data2['AssetsEnd3'], bins = 15, kde = False)
fig = sns.distplot(data2['AssetsEnd4'], bins = 15, kde = False)
fig.legend(labels = ['AssetsEnd1','AssetsEnd2','AssetsEnd3','AssetsEnd4'])
# Here we need to use Log transformation to bring the right-skewed AssetsEnd
# levels closer to normal.
import numpy as np
# BUG FIX: the original assigned AssetsEnd1_log three times in a row (with the
# logs of AssetsEnd1, AssetsEnd2, then AssetsEnd3), so the _log columns for
# weeks 1-3 all ended up holding log(AssetsEnd3). Transform each column
# independently instead.
for _col in ['AssetsEnd1', 'AssetsEnd2', 'AssetsEnd3', 'AssetsEnd4']:
    data2[_col + '_log'] = np.log(data2[_col])
# drop the raw (un-logged) level columns
data2 = data2.drop(['AssetsEnd1', 'AssetsEnd2','AssetsEnd3','AssetsEnd4'], axis=1)
# Distributions after the log transform (should be closer to normal now).
plt.figure(figsize=(10,6))
sns.distplot(data2['AssetsEnd1_log'], bins = 15, kde = False)
fig = sns.distplot(data2['AssetsEnd2_log'], bins = 15, kde = False)
fig = sns.distplot(data2['AssetsEnd3_log'], bins = 15, kde = False)
fig = sns.distplot(data2['AssetsEnd4_log'], bins = 15, kde = False)
fig.legend(labels = ['AssetsEnd1_log','AssetsEnd2_log','AssetsEnd3_log','AssetsEnd4_log'])
data2.shape
# split the target off from the feature frame
label = data2[['NextMonthFlow']]
data2 = data2.drop(['NextMonthFlow'],axis=1)
import seaborn as sns
sns.histplot(label['NextMonthFlow'])
plt.ylim((0,600))
# Discretize NextMonthFlow into three signal classes (stored as strings):
# sell '-1' below -100, hold '0' within [-100, 100], buy '1' above 100.
classes = []
for i in label['NextMonthFlow'].values:
    if i < -100 :
        classes.append('-1')
    elif i >=-100 and i<=100:
        classes.append('0')
    elif i >100:
        classes.append('1')
label['classes'] = classes
label
# Class-balance summary.
# FIX: the original third print additionally filtered NextMonthFlow < 25000,
# which could disagree with the '1' class defined above; also removed the
# no-op ''.format() calls and the redundant `== True` comparisons.
print('The number of Flow located within (,-100) is', len(label.loc[label['NextMonthFlow'] < -100]))
print('The number of Flow located within [-100,100] is', len(label.loc[(label['NextMonthFlow'] >= -100) & (label['NextMonthFlow'] <= 100)]))
print('The number of Flow located within (100,) is', len(label.loc[label['NextMonthFlow'] > 100]))
pd.value_counts(label['classes'])
# Check the correlation of the features (after encoding and transforms)
import seaborn as sns
plt.figure(figsize=(20,15))
corr = data2.corr()
sns.heatmap(corr,linewidths=.2, cmap="YlGnBu",
            annot=True,
            fmt=".2f",
            annot_kws={'size':8,'weight':'normal', 'color':'#253D24'})
def df_norm(df, cols):
    """Return df[cols] plus a min-max normalized '<col>_n' column for each col.

    The input frame is left untouched. NOTE(review): a constant column
    (max == min) divides by zero, exactly as in the original — confirm callers
    never pass one.
    """
    # FIX: operate on an explicit copy — the original mutated the result of
    # df[cols], which raises pandas' SettingWithCopyWarning and may silently
    # fail to write depending on internals.
    df_n = df[cols].copy()
    for col in cols:
        ma = df_n[col].max()
        mi = df_n[col].min()
        df_n[f'{col}_n'] = (df_n[col] - mi) / (ma - mi)
    return df_n
# convert calendar year to "years before 2022" so the feature is a small offset
data2['year'] = 2022 - data2['year']
data2
from sklearn.model_selection import train_test_split
# 80/20 random split with a fixed seed for reproducibility
Xtrain, Xtest, ytrain, ytest = train_test_split(np.array(data2), np.array(label['classes']), test_size=0.2, random_state=888)
# Benchmark a battery of sklearn classifiers on the 3-class flow signal,
# printing accuracy, per-class precision/recall/f1, and the confusion matrix.
# FIX: hoisted the metric imports out of the loop (they were re-executed every
# iteration), removed the `model=model` no-op, and predict the test set once
# (y_pred was an unused duplicate of pre_y_test).
from sklearn.metrics import classification_report
from sklearn import metrics

models=[KNeighborsClassifier(),
        LogisticRegression(penalty='l2'),
        RandomForestClassifier(n_estimators=15),
        tree.DecisionTreeClassifier(),
        GradientBoostingClassifier(n_estimators=1000),
        AdaBoostClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        SVC(kernel='rbf', probability=True),
        ]
models_str=['KNN',
            'LogisticRegression',
            'RandomForest',
            'DecisionTree',
            'GBDT',
            'AdaBoost',
            'GaussianNB',
            'LinearDiscriminantAnalysis',
            'QuadraticDiscriminantAnalysis',
            'SVM']
score_=[]
for name, model in zip(models_str, models):
    print('Start Training:'+name)
    model.fit(Xtrain, ytrain)
    pre_y_test = model.predict(Xtest)
    # accuracy = fraction of correct test predictions
    true_false = (pre_y_test == ytest)
    accuracy = np.count_nonzero(true_false) / float(len(ytest))
    print()
    print("accuracy is %f" % accuracy)
    # precision / recall / f1-score per class
    print()
    print(metrics.classification_report(ytest, pre_y_test))
    # confusion matrix
    print("Confusion Matrix...")
    print(metrics.confusion_matrix(ytest, pre_y_test))